import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns; sns.set(style="ticks", color_codes=True)
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import fbprophet
from fbprophet import Prophet
import os
from scipy import stats
import sklearn
import tslearn
from tslearn.neighbors import KNeighborsTimeSeries
from tslearn.utils import to_time_series_dataset
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# Quick sanity check of the working-directory contents before loading data.
print(os.listdir())

# Weekly sales per store, indexed by date.
df = pd.read_csv('Sales_Data_.csv', index_col=0)
df.head()

# Hold out everything from 2012-05-20 onward as the test period.
df_train = df[df.index < '2012-05-20']
df_test = df[df.index >= '2012-05-20']

# Build the Prophet input frame for store 1. Prophet requires the date
# column to be named 'ds' and the target column to be named 'y'.
sample_data = pd.DataFrame(df[df['Store_ID'] == 1]['Weekly_Sales'])
sample_data['ds'] = sample_data.index
sample_data['y'] = sample_data['Weekly_Sales']
sample_data.drop('Weekly_Sales', axis=1, inplace=True)

sample_data_train = sample_data[sample_data.index < '2012-05-20']
sample_data_test = sample_data[sample_data.index >= '2012-05-20']
sample_data.reset_index(inplace=True)

# Fit a Prophet model with yearly seasonality and US holiday effects,
# then forecast over the held-out test dates.
m = Prophet(yearly_seasonality=True)
m.add_country_holidays(country_name='US')
m.fit(sample_data_train)
forecast = m.predict(sample_data_test)
forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail()

from fbprophet.plot import plot_plotly
import plotly.offline as py
py.init_notebook_mode()
fig = plot_plotly(m, forecast)  # This returns a plotly Figure
py.iplot(fig)
m.plot_components(forecast);
def make_comparison_dataframe(historical, forecast):
    """Join Prophet forecast columns with the historical actuals on 'ds'.

    Parameters
    ----------
    historical : pd.DataFrame
        Frame containing a 'ds' date column and the actual values ('y').
    forecast : pd.DataFrame
        Output of Prophet.predict; must contain 'ds', 'yhat',
        'yhat_lower' and 'yhat_upper'.

    Returns
    -------
    pd.DataFrame
        Indexed by 'ds', with the three forecast columns followed by the
        historical columns (NaN where a forecast date has no actual).
    """
    # Left join on the date index: only dates present in the forecast
    # are kept.
    return forecast.set_index('ds')[['yhat', 'yhat_lower', 'yhat_upper']].join(
        historical.set_index('ds'))
# Needed so pandas datetime values plot cleanly with matplotlib.
pd.plotting.register_matplotlib_converters()

# Compare the store-1 forecast against actuals and score it.
cmp_df = make_comparison_dataframe(sample_data, forecast)
cmp_df.head()
cmp_df[['y', 'yhat']].plot()
mean_absolute_error(cmp_df['y'].values, cmp_df['yhat'].values)

# Repeat the base Prophet forecast for store 6 (later used as the
# nearest-neighbour regressor for store 1).
sample_data = pd.DataFrame(df[df['Store_ID'] == 6]['Weekly_Sales'])
sample_data['ds'] = sample_data.index
sample_data['y'] = sample_data['Weekly_Sales']
sample_data.drop('Weekly_Sales', axis=1, inplace=True)
sample_data_train = sample_data[sample_data.index < '2012-05-20']
sample_data_test = sample_data[sample_data.index >= '2012-05-20']
sample_data.reset_index(inplace=True)

m = Prophet(yearly_seasonality=True)
m.add_country_holidays(country_name='US')
m.fit(sample_data_train)
forecast = m.predict(sample_data_test)
py.init_notebook_mode()
fig = plot_plotly(m, forecast)  # This returns a plotly Figure
py.iplot(fig)
m.plot_components(forecast);
cmp_df = make_comparison_dataframe(sample_data, forecast)
cmp_df.head()
cmp_df[['y', 'yhat']].plot()

# Stitch store 6's observed training values together with its forecast
# over the test window; this full-length series becomes the
# 'neighbour_1' regressor for store 1 below.
# NOTE(review): Series.append is deprecated/removed in modern pandas;
# pd.concat is the direct equivalent.
neighbour_1 = pd.concat([sample_data_train['y'], cmp_df['yhat']],
                        ignore_index=True)
# There are several ways to augment the training data for prediction:
# - Feature augmentation: predict store 6 with the base model, then add its
#   training values and predicted values as a regressor for the store-1 prediction.
# - Instance augmentation: add the nearest neighbour's data directly (difficult
#   to place, since the sequence order is important).
# As a sanity check, let's look at the correlation coefficient between these time series.
# Correlation between store 6 and store 1 weekly sales — sanity check
# that store 6's series is informative about store 1.
np.corrcoef(df[df['Store_ID'] == 6]['Weekly_Sales'].values,
            df[df['Store_ID'] == 1]['Weekly_Sales'].values)

# Rebuild the store-1 Prophet frame, this time attaching store 6's series
# (actuals over the training window, base-model forecasts over the test
# window) as the extra regressor 'neighbour_1'.
sample_data = pd.DataFrame(df[df['Store_ID'] == 1]['Weekly_Sales'])
sample_data['ds'] = sample_data.index
sample_data['y'] = sample_data['Weekly_Sales']
sample_data.drop('Weekly_Sales', axis=1, inplace=True)
# reset_index preserves columns, so one assignment before the split suffices.
sample_data['neighbour_1'] = neighbour_1.values
train = sample_data[sample_data.index < '2012-05-20']
test = sample_data[sample_data.index >= '2012-05-20']
sample_data.reset_index(inplace=True)

m = Prophet(yearly_seasonality=True)
m.add_country_holidays(country_name='US')
m.add_regressor('neighbour_1')  # store 6's series as an additional input
m.fit(train)
forecast = m.predict(test)
py.init_notebook_mode()
fig = plot_plotly(m, forecast)  # This returns a plotly Figure
py.iplot(fig)
m.plot_components(forecast);
cmp_df = make_comparison_dataframe(sample_data, forecast)
cmp_df[['y', 'yhat']].plot()
mean_absolute_error(cmp_df['y'].values, cmp_df['yhat'].values)  # looks promising, but we need to check whether it works in most cases
# Accumulators for the per-store evaluation loops below.
mae = []  # MAE of the base (no-regressor) model, one entry per store
store_identity = list(df.Store_ID.unique())  # store ids in iteration order
base_prediction = []  # test-window yhat arrays, aligned with store_identity
def base_prophet_forecast(store_id, forecast_period):
    """Fit a base Prophet model for one store and record its test MAE.

    Trains on dates before `forecast_period`, predicts from
    `forecast_period` onward, then appends the test MAE to the global
    `mae` list and the predicted values to the global `base_prediction`
    list (both aligned with the caller's iteration order).

    Parameters
    ----------
    store_id : value matched against df['Store_ID'].
    forecast_period : str
        Date string (e.g. '2012-05-20') marking the start of the test window.
    """
    sample_data = pd.DataFrame(df[df['Store_ID'] == store_id]['Weekly_Sales'])
    sample_data['ds'] = sample_data.index  # Prophet's required date column
    sample_data['y'] = sample_data['Weekly_Sales']  # Prophet's required target
    sample_data.drop('Weekly_Sales', axis=1, inplace=True)
    sample_data_train = sample_data[sample_data.index < forecast_period]
    sample_data_test = sample_data[sample_data.index >= forecast_period]
    sample_data.reset_index(inplace=True)

    # Weekly data: only yearly seasonality (plus US holidays) is modelled.
    m = Prophet(yearly_seasonality=True, daily_seasonality=False,
                weekly_seasonality=False)
    m.add_country_holidays(country_name='US')
    m.fit(sample_data_train)
    forecast = m.predict(sample_data_test)

    cmp_df = make_comparison_dataframe(sample_data, forecast)
    mae.append(mean_absolute_error(cmp_df['y'].values, cmp_df['yhat'].values))
    base_prediction.append(cmp_df['yhat'].values)
# Score the base model for every store.
for store in df.Store_ID.unique():
    base_prophet_forecast(store, '2012-05-20')

# Distribution and central tendency of the per-store base-model MAE.
# NOTE(review): sns.distplot is deprecated in modern seaborn;
# sns.histplot / sns.displot is the replacement.
sns.distplot(mae, bins=len(mae))
np.median(mae)

# Map store id -> base-model test predictions; used as neighbour
# regressors in the augmented model below.
base_predictions = dict(zip(store_identity, base_prediction))

# Nearest-neighbour lookup table per store (provides the '1-NN' column).
nn_df = pd.read_csv('Stores_NN.csv', index_col=0)
nn_df.head()
base_predictions.get(4)

# Merge the NN columns onto df while preserving the date index:
# pd.merge discards the index, so stash it in a column first and restore it.
df['date'] = df.index
df = pd.merge(df, nn_df, on="Store_ID")
df.index = df['date']
df.head()

NN_mae = []  # MAE of the NN-augmented model, one entry per store
def NN_prophet_forecast(store_id, forecast_period):
    """Fit a Prophet model for one store augmented with its 1-NN's series.

    The extra regressor 'neighbour_1' is the nearest neighbour's observed
    weekly sales over the training window, concatenated with the base
    model's predictions for that neighbour over the test window (looked
    up in the global `base_predictions` dict). Appends the test MAE to
    the global `NN_mae` list.

    Parameters
    ----------
    store_id : value matched against df['Store_ID'].
    forecast_period : str
        Date string marking the start of the test window.
    """
    sample_data = pd.DataFrame(df[df['Store_ID'] == store_id]['Weekly_Sales'])
    sample_data['ds'] = sample_data.index
    sample_data['y'] = sample_data['Weekly_Sales']
    sample_data.drop('Weekly_Sales', axis=1, inplace=True)

    # Id of this store's nearest neighbour. '1-NN' is taken from the first
    # row; .iloc[0] is explicitly positional — plain [0] on a date-indexed
    # Series relies on deprecated label/positional fallback indexing.
    nn_id = df[df['Store_ID'] == store_id]['1-NN'].iloc[0]

    # Neighbour's actuals for the training window followed by its
    # base-model test predictions.
    # NOTE(review): [:-24] assumes a fixed 24-week test window regardless
    # of forecast_period — TODO confirm against the split date.
    sample_data['neighbour_1'] = np.concatenate(
        (df[df['Store_ID'] == nn_id]['Weekly_Sales'][:-24].values,
         base_predictions.get(nn_id)),
        axis=0)

    sample_data_train = sample_data[sample_data.index < forecast_period]
    sample_data_test = sample_data[sample_data.index >= forecast_period]
    sample_data.reset_index(inplace=True)

    m = Prophet(yearly_seasonality=True, daily_seasonality=False,
                weekly_seasonality=False)
    m.add_country_holidays(country_name='US')
    m.add_regressor('neighbour_1')
    m.fit(sample_data_train)
    forecast = m.predict(sample_data_test)

    cmp_df = make_comparison_dataframe(sample_data, forecast)
    NN_mae.append(mean_absolute_error(cmp_df['y'].values, cmp_df['yhat'].values))
# Score the NN-augmented model for every store.
for store in df.Store_ID.unique():
    NN_prophet_forecast(store, '2012-05-20')

data = zip(mae, NN_mae)  # per-store (base MAE, augmented MAE) pairs

# Distribution and median of the NN-augmented per-store MAE.
sns.distplot(NN_mae, bins=len(NN_mae))
np.median(NN_mae)

# Side-by-side comparison: count which model wins per store.
results_df = pd.DataFrame(data, columns=['Base_Model', 'NN_Augmented'])
from collections import Counter
Counter(results_df[['Base_Model', 'NN_Augmented']].idxmin(axis=1))
# For 26 stores the base model gives the better prediction, but for 19 stores the
# NN-augmented approach works best. It would be useful to decide whether to use the
# augmented approach by looking at the distance to the nearest neighbour, since some
# neighbours may not be close to the query at all.